Investigate seasonality and long-term trends for each variable
# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
# Load data (relative file path) and view contents
df <- read.csv("../data/energy_dataset.csv")
head(df)
dim(df) # Number of rows and columns
## [1] 35064 29
summary(df) # Distribution and missingness for each variable
## time generation.biomass generation.fossil.brown.coal.lignite
## Length:35064 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.:333.0 1st Qu.: 0.0
## Mode :character Median :367.0 Median :509.0
## Mean :383.5 Mean :448.1
## 3rd Qu.:433.0 3rd Qu.:757.0
## Max. :592.0 Max. :999.0
## NA's :19 NA's :18
## generation.fossil.coal.derived.gas generation.fossil.gas
## Min. :0 Min. : 0
## 1st Qu.:0 1st Qu.: 4126
## Median :0 Median : 4969
## Mean :0 Mean : 5623
## 3rd Qu.:0 3rd Qu.: 6429
## Max. :0 Max. :20034
## NA's :18 NA's :18
## generation.fossil.hard.coal generation.fossil.oil generation.fossil.oil.shale
## Min. : 0 Min. : 0.0 Min. :0
## 1st Qu.:2527 1st Qu.:263.0 1st Qu.:0
## Median :4474 Median :300.0 Median :0
## Mean :4256 Mean :298.3 Mean :0
## 3rd Qu.:5839 3rd Qu.:330.0 3rd Qu.:0
## Max. :8359 Max. :449.0 Max. :0
## NA's :18 NA's :19 NA's :18
## generation.fossil.peat generation.geothermal
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## NA's :18 NA's :18
## generation.hydro.pumped.storage.aggregated
## Mode:logical
## NA's:35064
##
##
##
##
##
## generation.hydro.pumped.storage.consumption
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 68.0
## Mean : 475.6
## 3rd Qu.: 616.0
## Max. :4523.0
## NA's :19
## generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
## Min. : 0.0 Min. : 0
## 1st Qu.: 637.0 1st Qu.:1077
## Median : 906.0 Median :2164
## Mean : 972.1 Mean :2605
## 3rd Qu.:1250.0 3rd Qu.:3757
## Max. :2000.0 Max. :9728
## NA's :19 NA's :18
## generation.marine generation.nuclear generation.other
## Min. :0 Min. : 0 Min. : 0.00
## 1st Qu.:0 1st Qu.:5760 1st Qu.: 53.00
## Median :0 Median :6566 Median : 57.00
## Mean :0 Mean :6264 Mean : 60.23
## 3rd Qu.:0 3rd Qu.:7025 3rd Qu.: 80.00
## Max. :0 Max. :7117 Max. :106.00
## NA's :19 NA's :17 NA's :18
## generation.other.renewable generation.solar generation.waste
## Min. : 0.00 Min. : 0 Min. : 0.0
## 1st Qu.: 73.00 1st Qu.: 71 1st Qu.:240.0
## Median : 88.00 Median : 616 Median :279.0
## Mean : 85.64 Mean :1433 Mean :269.5
## 3rd Qu.: 97.00 3rd Qu.:2578 3rd Qu.:310.0
## Max. :119.00 Max. :5792 Max. :357.0
## NA's :18 NA's :18 NA's :19
## generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
## Min. :0 Min. : 0 Min. : 0
## 1st Qu.:0 1st Qu.: 2933 1st Qu.: 69
## Median :0 Median : 4849 Median : 576
## Mean :0 Mean : 5464 Mean :1439
## 3rd Qu.:0 3rd Qu.: 7398 3rd Qu.:2636
## Max. :0 Max. :17436 Max. :5836
## NA's :18 NA's :18
## forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
## Mode:logical Min. : 237
## NA's:35064 1st Qu.: 2979
## Median : 4855
## Mean : 5471
## 3rd Qu.: 7353
## Max. :17430
##
## total.load.forecast total.load.actual price.day.ahead price.actual
## Min. :18105 Min. :18041 Min. : 2.06 Min. : 9.33
## 1st Qu.:24794 1st Qu.:24808 1st Qu.: 41.49 1st Qu.: 49.35
## Median :28906 Median :28901 Median : 50.52 Median : 58.02
## Mean :28712 Mean :28697 Mean : 49.87 Mean : 57.88
## 3rd Qu.:32263 3rd Qu.:32192 3rd Qu.: 60.53 3rd Qu.: 68.01
## Max. :41390 Max. :41015 Max. :101.99 Max. :116.80
## NA's :36
#describe(df)
There are eight columns that contain no values (all NAs or zeroes). These are removed here.
df_clean <- df %>%
select(-c(generation.fossil.coal.derived.gas,
generation.fossil.oil.shale,
generation.fossil.peat,
generation.geothermal,
generation.marine,
generation.hydro.pumped.storage.aggregated,
generation.wind.offshore,
forecast.wind.offshore.eday.ahead))
colSums(is.na(df_clean)) # Number of missing values for each remaining variable
## time
## 0
## generation.biomass
## 19
## generation.fossil.brown.coal.lignite
## 18
## generation.fossil.gas
## 18
## generation.fossil.hard.coal
## 18
## generation.fossil.oil
## 19
## generation.hydro.pumped.storage.consumption
## 19
## generation.hydro.run.of.river.and.poundage
## 19
## generation.hydro.water.reservoir
## 18
## generation.nuclear
## 17
## generation.other
## 18
## generation.other.renewable
## 18
## generation.solar
## 18
## generation.waste
## 19
## generation.wind.onshore
## 18
## forecast.solar.day.ahead
## 0
## forecast.wind.onshore.day.ahead
## 0
## total.load.forecast
## 0
## total.load.actual
## 36
## price.day.ahead
## 0
## price.actual
## 0
16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.
# Rows where data is missing
df_clean[!complete.cases(df_clean),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df_clean[!complete.cases(df_clean),]))
missing_per_row
## 100 109 110 111 112 113 114 452 453 644 662 752 753
## 14 15 15 15 15 15 15 14 14 14 15 1 1
## 754 757 758 759 760 761 762 763 764 2259 2529 2624 2709
## 1 1 1 1 1 1 1 1 1 1 15 1 15
## 2914 3555 3969 6584 6587 8050 11237 11525 11527 11903 12673 13342 13392
## 1 1 14 1 15 15 1 1 1 1 1 14 1
## 15273 15983 16613 25165 25172 30186 30897
## 1 1 1 1 1 1 15
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9041096
What are we left with?
tail(df_clean)
35,000 obs across 20 variables, hourly from start 2015 to end 2018:
Energy generation from 14 different sources
Forecast onshore wind and solar, one day ahead
Forecast and actual total electricity load
Forecast and actual electricity price
And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities
# Formatting time variable to datetime object
# NB: Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC
df_clean$time <- as_datetime(df_clean$time)
head(df_clean)
gen_data <- df_clean %>%
select(c(1, starts_with("generation")))
# Converting to long form
long_gen_data <- gen_data %>%
pivot_longer(cols = -time, names_to = "generation_source") %>%
mutate(generation_source = str_replace(generation_source, "generation.", ""))
head(long_gen_data)
Plot distribution of (generation) data missingness
# NB: MAYBE DON'T DO WHOLE DATASET LOCALLY, IT CRASHED MY R SESSION
long_gen_data_small <- long_gen_data[1:10000,]
ggplot_na_distribution(x=long_gen_data_small$value, y=long_gen_data_small$time)
Smoothing the data on a daily scale
gen_data_smoothed <- long_gen_data %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = 'day', adjusted = mean(value))
head(gen_data_smoothed)
Looking at four series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)
gen_data_smoothed %>% filter(generation_source %in% c(
"biomass", "fossil.gas", "nuclear", "solar")) %>%
plot_time_series(time, adjusted, .facet_ncol = 2,
.smooth_period = "3 months")
long_gen_data %>%
group_by(generation_source) %>%
summarise_by_time(time, .by = '2 months', adjusted = mean(value)) %>%
ggplot() +
geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity") +
#geom_line(data = df_clean, aes(x = time, y = total.load.actual)) +
theme_minimal()
## Warning: Removed 101 rows containing missing values (position_stack).
long_gen_data$year <- year(long_gen_data$time)
long_gen_data$month <- month(long_gen_data$time)
long_gen_data$day <- day(long_gen_data$time)
long_gen_data$hour <- hour(long_gen_data$time)
hourly_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = hour, y = value, fill = generation_source)) +
geom_bar(stat = "identity")
hourly_generation_plot_by_year
## Warning: Removed 256 rows containing missing values (position_stack).
daily_generation_plot_by_year <- long_gen_data %>%
ggplot(aes(x = day, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year) + theme(legend.position = "none")
daily_generation_plot_by_year
## Warning: Removed 256 rows containing missing values (position_stack).
monthly_generation_plot <- long_gen_data %>%
ggplot(aes(x = month, y = value, fill = generation_source)) +
geom_bar(stat = "identity") +
facet_wrap(~year)
monthly_generation_plot
## Warning: Removed 256 rows containing missing values (position_stack).
```